Disclaimer: This is the Markdown file for Task B.

# Remove every object currently in the global environment before starting.
# NOTE(review): presumably redundant when the document is knitted in a fresh
# session; confirm and consider dropping it.
rm(list = ls())
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v tidyr   1.2.0     v forcats 0.5.1
## v readr   2.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(ggplot2)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library(rgdal)
## Warning: package 'rgdal' was built under R version 4.1.3
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## 
## rgdal: version: 1.5-29, (SVN revision 1165M)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/proj
library(geojsonio)
## Warning: package 'geojsonio' was built under R version 4.1.3
## Registered S3 method overwritten by 'geojsonsf':
##   method        from   
##   print.geojson geojson
## 
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
## 
##     pretty
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
library(leaflet)
library(reshape)
## Warning: package 'reshape' was built under R version 4.1.3
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:cowplot':
## 
##     stamp
## The following object is masked from 'package:lubridate':
## 
##     stamp
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any
library(raster)
## Warning: package 'raster' was built under R version 4.1.3
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
library(RColorBrewer)
library(spatialEco)
## 
## Attaching package: 'spatialEco'
## The following object is masked from 'package:raster':
## 
##     shift
## The following object is masked from 'package:dplyr':
## 
##     combine
library(htmltools)

B1. Create a GeoJSON file where each postcode is represented with a latitude, longitude value, together with minimum, maximum, mean and median house price.

Ans: We create a GeoJSON file with the required information

We load the required data. We remove the ID column from the postcodes dataset as it is a redundant column. Then we create a column to store the years of our original dataset. This is done for ease of viewing and grouping the data later on.

# Work from the project folder so that the relative file names below resolve.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
setwd("D:\\BSE\\BSE Material\\sem 2\\Data Vis\\Project")

# Raw price-paid transactions.
pp_data <- read.csv(file = "ppdata_lite.csv")

# Load file with postcodes and latitude/longitude. header = TRUE and sep = ","
# are read.csv()'s defaults, so they are not spelled out.
ukpostcodes <- read.csv(file = "ukpostcodes.csv")
# The first column (an id) seems redundant, so it is dropped.
ukpostcodes <- ukpostcodes[-1]

# Copy of the transactions with the calendar year of each transfer appended;
# the $year field of a POSIXlt counts years since 1900, hence the offset.
transfer_time <- as.POSIXlt(pp_data$date_of_transfer)
ppdata <- pp_data
ppdata$year <- transfer_time$year + 1900

We create a separate dataframe which contains postcodes, prices and years, and find the mean, max, min and median price for each of the postcodes available.

## # A tibble: 6 x 5
##   postcode  mean_price median_price max_price min_price
##   <chr>          <dbl>        <dbl>     <int>     <int>
## 1 ""           280211.       135000  44033000       750
## 2 "AL1 1AJ"    203018.       159995    435000    101950
## 3 "AL1 1AR"    300000        300000    350000    250000
## 4 "AL1 1AS"    356667.       285000    500000    285000
## 5 "AL1 1BH"    134500        170000    187000     55500
## 6 "AL1 1BX"    731667.       725000   1200000    270000

We then remove the first row: as the table above shows, it has an empty postcode, so we do not require it, and it throws an error when we merge the data with ukpostcodes. We create a spatial dataset and create the required GeoJSON file.

##   postcode mean_price median_price max_price min_price
## 1  AL1 1AJ   203018.4       159995    435000    101950
## 2  AL1 1AR   300000.0       300000    350000    250000
## 3  AL1 1AS   356666.7       285000    500000    285000
## 4  AL1 1BH   134500.0       170000    187000     55500
## 5  AL1 1BX   731666.7       725000   1200000    270000
## 6  AL1 1BZ   265000.0       265000    390000    140000

B2. Open the GeoJSON file in the GIS application of your choice and colour-code the data to give an overview of areas with high, medium and low median house price. Additionally, you can visualise this information as choropleths or use shiny and add the information as markers on a map for a more interactive and impressive visualisation.

Ans: The question requires us to use a GIS application and view areas with high, medium and low median house prices. After further discussion with colleagues, I use the Area spatial data, which is keyed by the first (in some cases the first two) letters of the postcodes. This reduced the running time, a major constraining factor when computing on the device used, and helped in displaying the spatial data better for the required plot.

This time we create the dataset by starting from the original dataset and then mutating the postcode column so that it keeps only the first letter or the first two letters of the postcode. We then, as instructed, find the mean, median, max and min prices of the houses according to these postcode areas.

# Price statistics per postcode *area*, i.e. the leading letters of the
# postcode ("AL" for "AL1 1AJ", "B" for "B1 1AA").
#
# Changes compared with the previous version:
#  - The area is the leading run of letters. Stripping all digits and keeping
#    the first two characters produced "B " for single-letter areas (which
#    never matches the shapefile's "B", leaving that polygon NA after the
#    merge) and mislabelled districts such as "W1A" as area "WA".
#  - Transactions without a postcode are removed by value. Dropping row 1 of
#    the summary only works when a blank group exists and sorts first;
#    otherwise it silently discards a real area.
#  - summarise_at() is superseded; the statistics are spelled out directly.
Dataset <- pp_data
Dataset$postcode <- sub("^([[:alpha:]]*).*$", "\\1", trimws(Dataset$postcode))
Dataset <- na.omit(Dataset)
Dataset <- Dataset[Dataset$postcode != "", ]
Dataset <- Dataset %>%
  dplyr::group_by(postcode) %>%
  dplyr::summarise(
    mean_price = mean(price),
    median_price = median(price),
    max_price = max(price),
    min_price = min(price)
  )

head(Dataset)
## # A tibble: 6 x 5
##   postcode mean_price median_price max_price min_price
##   <chr>         <dbl>        <dbl>     <int>     <int>
## 1 "AL"        274660.      214998.  10004563       375
## 2 "B "        134297.      110000   20000000       700
## 3 "BA"        172719.      142000    4025000      3000
## 4 "BB"         82682.       61500    4050000       150
## 5 "BD"        100427.       82000    5875000      2000
## 6 "BH"        193451.      163000   19972500      1000

We get our area-specific spatial data and combine it with the dataset created above to obtain a spatial dataset which contains the longitude, latitude, price statistics and postcodes of the data that is to be plotted.

# Read the postcode-area polygons with raster's shapefile() reader.
Area <- raster::shapefile("shapes/Areas.shp")
# Print the class of the object that was loaded.
class(Area)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# Attach the per-area price statistics to the polygons: the shapefile's
# `name` attribute is matched against `postcode` in the summary table.
Map_data <- merge(
  x = Area,
  y = Dataset,
  by.x = "name",
  by.y = "postcode"
)

head(Map_data)
##   name mean_price median_price max_price min_price
## 1   AB         NA           NA        NA        NA
## 2   AL  274660.23     214997.5  10004563       375
## 3    B         NA           NA        NA        NA
## 4   BA  172719.47     142000.0   4025000      3000
## 5   BB   82681.88      61500.0   4050000       150
## 6   BD  100427.21      82000.0   5875000      2000
# Discard the polygons whose attributes contain NA, i.e. the areas that
# received no price statistics in the merge.
Map_data <- spatialEco::sp.na.omit(Map_data)
## Deleting rows: 132331333638465152545556646768777986110120121122123124

Before plotting, we need to define how we will divide our data to show different house prices and whether they are low, medium or high. Instead of sticking to three categories, I have chosen to divide the values into 6 parts based on their percentiles. We then assign colours to these intervals.

# Colour scale for the mean-price map: six bins whose upper edges are the
# sextiles of the per-area mean price, with 0 as the lower edge of the first.
# The probabilities are exact sixths (the hand-typed 0.33 made the bins
# uneven), and TRUE/FALSE replace the reassignable T/F shorthands.
intervals <- quantile(
  Map_data$mean_price,
  probs = seq_len(6) / 6,
  names = FALSE,
  na.rm = TRUE
)
# Prepend the lower bound; unique() guards against tied quantiles, because
# the bin edges handed to colorBin() must be distinct.
values <- unique(c(0, intervals))
factpal <- colorBin("PRGn", bins = values, domain = Map_data$mean_price)

We then try to display the UK property prices using these intervals based on the postcodes.

# Interactive choropleth of the mean house price per postcode area.
# Columns are referenced through formulas, which leaflet evaluates against
# the data passed to leaflet(), here Map_data.
# NOTE(review): Stamen-hosted tiles have reportedly moved to Stadia Maps;
# confirm that this provider name still serves tiles.
mapplot_mean <- leaflet(Map_data) %>%
  setView(lng = -2, lat = 52.2783, zoom = 8) %>%
  addProviderTiles("Stamen.TonerHybrid") %>%
  addPolygons(
    fillColor = ~ factpal(mean_price),
    weight = 0.2,
    fillOpacity = 0.5,
    smoothFactor = 0.2
  ) %>%
  addLegend(
    pal = factpal,
    values = ~mean_price,
    title = "Mean HP data"
  )

mapplot_mean

We now use the median instead of the mean, as the mean can be distorted by a few very expensive properties and so might not display the true typical value of property prices. The median is better for skewed distributions, so choosing the median gives us a better, more robust and more sensible plot. The median frees us from this disadvantage of the mean, which depends not just on the values but also on how often the extreme ones occur.

# Colour scale for the median-price map: six bins whose upper edges are the
# sextiles of the per-area median price, with 0 as the lower edge of the
# first. The probabilities are exact sixths (the hand-typed 0.33 made the
# bins uneven), and TRUE/FALSE replace the reassignable T/F shorthands.
intervals <- quantile(
  Map_data$median_price,
  probs = seq_len(6) / 6,
  names = FALSE,
  na.rm = TRUE
)
# Prepend the lower bound; unique() guards against tied quantiles, because
# the bin edges handed to colorBin() must be distinct.
values <- unique(c(0, intervals))
factpal <- colorBin("PRGn", bins = values, domain = Map_data$median_price)


# Interactive choropleth of the median house price per postcode area.
# Columns are referenced through formulas, which leaflet evaluates against
# the data passed to leaflet(), here Map_data.
# NOTE(review): Stamen-hosted tiles have reportedly moved to Stadia Maps;
# confirm that this provider name still serves tiles.
mapplot_median <- leaflet(Map_data) %>%
  setView(lng = -2, lat = 52.2783, zoom = 8) %>%
  addProviderTiles("Stamen.TonerHybrid") %>%
  addPolygons(
    fillColor = ~ factpal(median_price),
    weight = 0.5,
    fillOpacity = 0.5,
    smoothFactor = 0.25
  ) %>%
  addLegend(
    pal = factpal,
    values = ~median_price,
    title = "Median HP data"
  )
mapplot_median